*****************************************************************************************
** TPG Replication: Vote Your Region or Your Income? Decomposing Variance in Redistributive Voting 
** Stata Do-file Created by Dong Wook Lee & Melisas Rogers  
** Software Version: StataNow/SE 18.5 for Windows [64-bit x86-64]
** Table 1. Economic Effects on Redistributive Voting Behavior            
*****************************************************************************************

** Multiple datasets are combined: 
** 1) Comparative Study of Electoral Systems (CSES) -- Integrated Module 1 through 4 
** 2) Comparative Manifesto Project (CMP) 
** 3) OECD's Regional Economy Statistics

******************************************************************************
** Data Loading for Vote Choice Analysis Using a Continous Measure (Model 1 through Model 3         ** 
******************************************************************************
*Setting a working directory
clear all
cd "G:\My Drive\companion paper with Melissa\TPG Final Submission (Replication)\Figure 2"

*Data Loading : Electoral District = TL3 Matching Case Only*
use "masterdata_electoral_district.dta"

*******************************
* Dependent Variable
*******************************
/*Postive Value: Parties with Less Redistributive Economic Policies & Negative Value: Parties with More Redistributive Economic Policies */
/*Combined measures of a party's policy position (economic topics)*/
gen rile_econ = log((per403 + per404 +  per406 + per412 + per413 + per504 + per506 + per701 + 0.5)/(per401 + per402 + per407 + per414 + per505 +0.5))
drop if rile_econ==.

*******************************
*Key Independnet Variables
*******************************
*Regional productivity qunitile (Used the per capita GDP of an electoral district, drawn from the OECD Regional Economy Statistics) 
tab rinc_quintile_t1, missing
tab rinc_quintile_t1, gen(rgroup_)

*Household income quintile
tab IMD2006, missing
recode IMD2006 (7 8 9 =.), gen(IMD2006_rev)
tab IMD2006_rev, missing 
drop if rinc_quintile_t1==. | IMD2006_rev==. 
tab country, missing

*Renaming Key Independent Variables
rename rinc_quintile_t1 rinc_quintile     /*LOW: Poor, High: Rich*/
rename IMD2006_rev hinc_quintile          /*LOW: Poor, High: Rich*/ 

********************************
*Individual-Level Control Variables
********************************
*Self-identification: Right (0)-Left (10)
recode IMD3006 (95 97 98 99 =.), gen(left_right)
recode left_right (0=10) (1=9) (2=8) (3=7) (4=6) (5=5) (6=4) (7=3) (8=2) (9=1) (10=0), gen(right_left)
*Age (16-95)
recode IMD2001_1 (9997 9999 =.), gen(age)
tab age
*Age squared
gen age_sq = age^2
*Male (1=Yes, 0=No)
recode IMD2002 (1=1) (2=0) (9=.), gen(male)
tab male    
*Higher education (1=Yes, 0=No)
recode IMD2003 (0=0) (1 2 = 0) (3 4 =1) (6 7 8 9 =.), gen(educationlevel)
tab educationlevel
*Married (1= Yes, 0=No)                  
recode IMD2004 (1 3 =1) (2 4 =0) (5 7 9 =.), gen(married)
tab married
*Religiosity (1=Yes, 0 = No)                         
recode IMD2005 (1 2 3 4 5 7 8 9 11 = 1) (12 13 96 97  = 0) ( 98 99 =.), gen(religiosity)
tab religiosity

***********************************
*Group-level Control Variables
***********************************
*voter turnout %
rename IMD5006_1 voteturnout
*country-specific election year
rename IMD1008_YEAR electionyear
*Assigning country id
egen countryid = group(country)

/*Country Name (Labeling)*/
tab countryid, gen(countryid_)
tab countryid country
rename countryid_1 austria
rename countryid_2 czech_republic
rename countryid_3 denmark
rename countryid_4 finland 
rename countryid_5 germany
rename countryid_6 greece
rename countryid_7 italy
rename countryid_8 latvia
rename countryid_9 norway
rename countryid_10 poland 
rename countryid_11 spain
rename countryid_12 switzerland
drop if denmark==1 | latvia==1 | norway==1    /*Due to missing observations*/
tab electionyear, gen(y_)

***************************************
** Regression Analysis: Model 1 from Table 1
*************************************** 
/*Two-stage estimate of β (OLS estimates)*/
reg rile_econ hinc_quintile rinc_quintile right_left age age_sq male educationlevel married religiosity czech_republic finland germany greece italy poland spain switzerland y_2-y_15
predict k if e(sample), xb
gen resid = rile_econ-k
egen stresid=std(resid)
gen outlier = 0 if e(sample)
replace outlier = 1 if abs(stresid)>1.5
reg rile_econ hinc_quintile rinc_quintile right_left age age_sq male educationlevel married religiosity czech_republic finland germany greece italy poland spain switzerland y_2-y_15 if outlier!=1
predict R, r                                                                                            /*OLS residual*/
gen R2=R^2                                                                                              /*squared residuals for glm fit*/

glm R2 hinc_quintile rinc_quintile voteturnout, family(gamma) link(log)                                 /*gamma reg on log(r2)*/
predict S2, mu                                                                                          /*fitted variances, exp(xb)*/
gen LOGLIK = -.5*(ln(S2)+(R2/S2))                                                                                                                                /*evaluating log likelihood*/
egen LL0 = sum(LOGLIK)                                                                                  /*summing log likelihood*/
display LL0

/*Updating beta and lambda coefficients*/
gen DLL=1                                                                                              /*initialize change in loglik*/
while DLL> 0.00001{                                    
drop R k resid stresid outlier                                                                                        
reg rile_econ hinc_quintile rinc_quintile right_left age age_sq male educationlevel married religiosity czech_republic finland germany greece italy poland spain switzerland y_2-y_15 [aw=1/S2] /*WLS with variances as weights*/
predict k if e(sample), xb
gen resid = rile_econ-k
egen stresid=std(resid)
gen outlier = 0 if e(sample)
replace outlier = 1 if abs(stresid)>1.5
reg rile_econ hinc_quintile rinc_quintile right_left age age_sq male educationlevel married religiosity czech_republic finland germany greece italy poland spain switzerland y_2-y_15 [aw=1/S2] if outlier!=1 /*WLS with variances as weights*/
drop S2
predict R, r                                                                                           /*WLS residuals*/
replace R2 = R^2                                                                                       /*squared residuals for glm fit*/
est store BETA                                                                                         /*saving beta coefficient (hinc_qunitile)*/
glm R2 hinc_quintile rinc_quintile voteturnout, family(gamma) link(log)                                /*gamma reg on log(r2)*/
predict S2, mu                                                                                         /*fitted variances, exp(Xb)*/
est store LAMBDA                                                                                       /*saving lambda coefficeints (rinc_quntile)*/
replace LOGLIK = -.5*(ln(S2)+(R2/S2))                                                                                                                       /*evaluating log likelihood*/
egen LLN = sum(LOGLIK)                                                                                 /*summing log likelihood*/
di LLN
replace DLL=LLN-LL0                                                                                    /*assess convergence*/
replace LL0=LLN      
drop LLN
}

est table BETA LAMBDA, b star(.1 .05 .01) stats(N ll aic bic)                                          /*table with coefficeints, P<|Z|, and se's*/
est table BETA LAMBDA, b se stats(N ll aic bic) b(%7.3f) se(%7.3f)

****************************************
** After Update (The final regression to run)
****************************************
reg rile_econ hinc_quintile rinc_quintile right_left age age_sq male educationlevel married religiosity czech_republic finland germany greece italy poland spain switzerland y_2-y_15 [aw=1/S2] /*WLS with variances as weights*/
adjust hinc_quintile=1 rinc_quintile=1 if e(sample), gen(h1r1_b) xb
adjust hinc_quintile=1 rinc_quintile=5 if e(sample), gen(h1r5_b) xb
adjust hinc_quintile=5 rinc_quintile=1 if e(sample), gen(h5r1_b) xb
adjust hinc_quintile=5 rinc_quintile=5 if e(sample), gen(h5r5_b) xb

svyset countryid, strata(electionyear)
estpost svy: tabulate hinc_quintile if rinc_quintile ==1
matrix list e(b)
matrix b=e(b)
scalar PP_b=b[1,1]
sum rile_econ if e(sample) & rinc_quintile == 1
gen between_h1r1 = (h1r1_b-`r(mean)')^2*PP_b
sum between_h1r1 if hinc_quintile==1 & rinc_quintile==1

estpost svy: tabulate hinc_quintile if rinc_quintile ==1
matrix list e(b)
matrix b=e(b)
scalar RP_b=b[1,5]
sum rile_econ if e(sample) & rinc_quintile == 1
gen between_h5r1 = (h5r1_b-`r(mean)')^2*RP_b
sum between_h5r1 if hinc_quintile==5 & rinc_quintile==1

estpost svy: tabulate hinc_quintile if rinc_quintile ==5
matrix list e(b)
matrix b=e(b)
scalar PR_b=b[1,1]
sum rile_econ if e(sample) & rinc_quintile == 5
gen between_h1r5 = (h1r5_b-`r(mean)')^2*PR_b
sum between_h1r5 if hinc_quintile==1 & rinc_quintile==5

estpost svy: tabulate hinc_quintile if rinc_quintile ==5
matrix list e(b)
matrix b=e(b)
scalar RR_b=b[1,5]
sum rile_econ if e(sample) & rinc_quintile == 5
gen between_h5r5 = (h5r5_b-`r(mean)')^2*RR_b
sum between_h5r5 if hinc_quintile==5 & rinc_quintile==5

**************************************************
** Summary of Between Vsariance 
**************************************************
sum between_h1r1 if hinc_quintile==1 & rinc_quintile==1      /*Between Variance: Poor People in Poor Regions*/
sum between_h5r1 if hinc_quintile==5 & rinc_quintile==1      /*Between Variance: Rich People in Poor Regions*/
sum between_h1r5 if hinc_quintile==1 & rinc_quintile==5      /*Between Variance: Poor People in Rich Regions*/ 
sum between_h5r5 if hinc_quintile==5 & rinc_quintile==5      /*Between Variance: Poor People in Rich Regions*/ 

glm R2 hinc_quintile rinc_quintile voteturnout, family(gamma) link(log)                    /*gamma reg on log(r2)*/
adjust hinc_quintile=1 rinc_quintile=1 if e(sample), gen(h1r1_w) xb
adjust hinc_quintile=1 rinc_quintile=5 if e(sample), gen(h1r5_w) xb
adjust hinc_quintile=5 rinc_quintile=1 if e(sample), gen(h5r1_w) xb
adjust hinc_quintile=5 rinc_quintile=5 if e(sample), gen(h5r5_w) xb

svyset countryid, strata(electionyear)
estpost svy: tabulate hinc_quintile if rinc_quintile ==1
matrix list e(b)
matrix b=e(b)
scalar PP_w=b[1,1]
gen within_h1r1 = exp(h1r1_w)*PP_w if e(sample) &  rinc_quintile == 1
sum within_h1r1 if hinc_quintile==1 & rinc_quintile==1

estpost svy: tabulate hinc_quintile if rinc_quintile ==1
matrix list e(b)
matrix b=e(b)
scalar RP_w=b[1,5]
gen within_h5r1 = exp(h5r1_w)*RP_w if e(sample) &  rinc_quintile == 1
sum within_h5r1 if hinc_quintile==5 & rinc_quintile==1

estpost svy: tabulate hinc_quintile if rinc_quintile ==5
matrix list e(b)
matrix b=e(b)
scalar PR_w=b[1,1]
gen within_h1r5 = exp(h5r1_w)*PR_w if e(sample) & rinc_quintile == 5
sum within_h1r5 if hinc_quintile==1 & rinc_quintile==5

estpost svy: tabulate hinc_quintile if rinc_quintile ==5
matrix list e(b)
matrix b=e(b)
scalar RR_w=b[1,5]
gen within_h5r5 = exp(h1r5_w)*RR_w if e(sample) & rinc_quintile == 5
sum within_h5r5 if hinc_quintile==5 & rinc_quintile==5

***********************************************
** Summary of Within Vaariance  
***********************************************
sum within_h1r1 if hinc_quintile==1 & rinc_quintile==1      /*Within Variance: Poor People in Poor Regions*/
sum within_h5r1 if hinc_quintile==5 & rinc_quintile==1      /*Wihitn Variance: Rich People in Poor Regions*/
sum within_h1r5 if hinc_quintile==1 & rinc_quintile==5      /*Within Variance: Poor People in Rich Regions*/
sum within_h5r5 if hinc_quintile==5 & rinc_quintile==5      /*Within Variance: Poor People in Rich Regions*/

************************************************
** Summary of total variance (between + within) 
************************************************
gen total_h1r1 = within_h1r1 + between_h1r1 if hinc_quintile==1 & rinc_quintile==1  
gen total_h5r1 = within_h5r1 + between_h5r1 if hinc_quintile==5 & rinc_quintile==1  
gen total_h1r5 = within_h1r5 + between_h1r5 if hinc_quintile==1 & rinc_quintile==5  
gen total_h5r5 = within_h5r5 + between_h5r5 if hinc_quintile==5 & rinc_quintile==5  

********************************
** With 95 Conflidence Intervalus 
********************************
ci mean total_h1r1 
scalar PP_mean=`r(mean)'
scalar PP_lower = `r(lb)'
scalar PP_upper = `r(ub)'

ci mean total_h5r1 
scalar RP_mean=`r(mean)'
scalar RP_lower=`r(lb)'
scalar RP_upper=`r(ub)'

ci mean total_h1r5 
scalar PR_mean=`r(mean)'
scalar PR_lower=`r(lb)'
scalar PR_upper=`r(ub)'

ci mean total_h5r5
scalar RR_mean=`r(mean)'
scalar RR_lower=`r(lb)'
scalar RR_upper=`r(ub)'

sum rile_econ if e(sample)    /*CSES Sampled Used for Variance Function Regression as Whole*/
scalar PP_mean_perc = PP_mean/`r(Var)'*100
scalar PP_lower_perc = PP_lower/`r(Var)'*100
scalar PP_upper_perc = PP_upper/`r(Var)'*100

scalar RP_mean_perc = RP_mean/`r(Var)'*100
scalar RP_lower_perc = RP_lower/`r(Var)'*100
scalar RP_upper_perc = RP_upper/`r(Var)'*100

scalar PR_mean_perc = PR_mean/`r(Var)'*100
scalar PR_lower_perc = PR_lower/`r(Var)'*100
scalar PR_upper_perc = PR_upper/`r(Var)'*100

scalar RR_mean_perc = RR_mean/`r(Var)'*100
scalar RR_lower_perc = RR_lower/`r(Var)'*100
scalar RR_upper_perc = RR_upper/`r(Var)'*100

display PP_lower_perc 
display PP_mean_perc 
display PP_upper_perc

display RP_lower_perc 
display RP_mean_perc 
display RP_upper_perc

display PR_lower_perc 
display PR_mean_perc 
display PR_upper_perc

display RR_lower_perc 
display RR_mean_perc 
display RR_upper_perc

***********************************
** Graph (with 95 confidence intervals) 
***********************************
frame rename default votechoice   /*with Stata 16*/
frame create votechoice_variance
frame change votechoice_variance
gen average =. 
gen lowerci =. 
gen upperci =. 
gen category=. 
set obs 1
replace category = 1 in 1     /*PP*/
set obs 2
replace category = 2 in 2     /*RP*/
set obs 3
replace category = 3 in 3     /*PR*/
set obs 4
replace category = 4 in 4     /*RR*/

replace average = PP_mean_perc if category == 1
replace average = RP_mean_perc if category == 2
replace average = PR_mean_perc if category == 3
replace average = RR_mean_perc if category == 4

replace lowerci = PP_lower_perc if category == 1
replace lowerci = RP_lower_perc if category == 2
replace lowerci = PR_lower_perc if category == 3
replace lowerci = RR_lower_perc if category == 4

replace upperci = PP_upper_perc if category == 1
replace upperci = RP_upper_perc if category == 2
replace upperci = PR_upper_perc if category == 3
replace upperci = RR_upper_perc if category == 4


*****************************************************
** Figure 2: Weighted Total Vote Choice Variance of Selected Groups
*****************************************************
eclplot average lowerci upperci category, ylabel(, nogrid) ///
ytitle(Percentage of Vote Choice Sample Variability) ysc(r(10 30)) ///
xsize(5.5) ysize(5) xsc(r(0 5)) xtitle("") xlabel("") ///
play("figure2_modification.grec") saving("figure2.gph", replace)
